# For data prep
import numpy as np
import pandas as pd
import sqlite3
from sqlalchemy import create_engine
# For PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import math
import bisect
# For Clustering
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
from scipy.cluster import hierarchy
# For plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotnine import *
# For writeup
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')
from IPython.display import HTML
# Notebook helper: injects a JS show/hide toggle for all code cells.
# BUG FIX: the original `else` branch called .hide() again, so clicking
# the toggle could never bring the code back — it must call .show().
# NOTE: this only has a visible effect when evaluated as the last
# expression of a Jupyter cell; as a plain script statement the HTML
# object is created and discarded.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value=""></form>''')
def drop_features(data, irrelevant_cols, skip_cols, p):
    """Drop low-variance features from a dataframe.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are screened. May mix numeric and
        categorical columns; categoricals are label-encoded before
        their variance is measured.
    irrelevant_cols : list or None
        Columns dropped unconditionally before thresholding.
    skip_cols : list or None
        Columns exempt from thresholding; always kept in the result.
    p : float
        Variance threshold. For example, p = 0.01 drops columns where
        99% of the values are similar.

    Returns
    -------
    pandas.DataFrame
        The original frame (original dtypes) restricted to the
        surviving columns, preceded by any skipped columns.
    """
    orig_data = data.copy(deep=True)
    candidates = orig_data
    if irrelevant_cols is not None:
        candidates = candidates.drop(irrelevant_cols, axis=1)
    if skip_cols is not None:
        candidates = candidates.drop(skip_cols, axis=1)
    numeric = pd.DataFrame(candidates.select_dtypes(include=np.number))
    categorical = pd.DataFrame(candidates.select_dtypes(exclude=np.number))
    if categorical.shape[1]:
        # Label-encode so VarianceThreshold can score categorical columns.
        encoded = categorical.apply(LabelEncoder().fit_transform)
        screened = pd.concat([numeric, encoded], axis=1)
    else:
        screened = numeric
    # threshold=0.01 would drop columns where 99% of values are similar.
    selector = VarianceThreshold(threshold=p)
    selector.fit_transform(screened)
    kept_idx = selector.get_support(indices=True)
    cols_kept = list(screened.columns[kept_idx])
    # Retained for optional debugging/reporting of what was removed.
    cols_dropped = list(set(screened.columns) - set(cols_kept))
    if skip_cols is not None:
        return orig_data[skip_cols + cols_kept]
    return orig_data[cols_kept]
def fix_dtypes(data):
    """Return a copy of *data* with inferred dtypes and parsed dates.

    Columns whose name contains '_dt' are coerced to datetime
    (unparseable values become NaT); all other columns go through
    pandas' convert_dtypes() inference. The input frame is untouched.
    """
    out = data.copy(deep=True).convert_dtypes()
    for col in out.columns:
        if '_dt' in col:
            out[col] = pd.to_datetime(out[col], errors='coerce')
    return out
def truncated_svd(X, thresh=0.90):
    """Full SVD of X plus each component's share of total variance.

    Returns (Q, S, P, NSSD): left singular vectors, diagonal matrix of
    singular values, right singular vectors (as columns of P), and the
    normalized squared singular values (fractions summing to 1).
    `thresh` only controls the optional report of how many components
    are needed to reach that cumulative share.
    """
    U, sv, Vt = np.linalg.svd(X, full_matrices=True)
    variance_share = (sv / np.sqrt((sv ** 2).sum())) ** 2
    # Components required to reach `thresh` cumulative variance
    # (kept for optional reporting).
    n_needed = bisect.bisect(variance_share.cumsum(), thresh) + 1
    return U, np.diag(sv), Vt.T, variance_share
def project_svd(q, s, k):
    """Project the design matrix onto the first k singular vectors.

    Equivalent to Q[:, :k] @ S[:k, :k], i.e. the data expressed in the
    first k left-singular directions scaled by their singular values.
    """
    return np.matmul(q[:, :k], s[:k, :k])
def plot_var(variance_explained, num_components):
    """Plot individual and cumulative variance explained per SV.

    Parameters
    ----------
    variance_explained : array
        Variance share of each singular value (fractions of 1).
    num_components : int or float
        x-position of the vertical dashed cutoff line marking the
        number of components kept.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    n = len(variance_explained)
    sv_index = range(1, n + 1)
    # One long-format frame: the individual shares stacked on top of
    # the cumulative shares, keyed by the 'Contribution' column.
    frames = [
        pd.DataFrame(zip(sv_index, ['Individual'] * n, variance_explained),
                     columns=['SV', 'Contribution', 'Value']),
        pd.DataFrame(zip(sv_index, ['Cumulative'] * n, variance_explained.cumsum()),
                     columns=['SV', 'Contribution', 'Value']),
    ]
    plot_df = pd.concat(frames, axis=0)
    plot_df['Contribution %'] = (plot_df['Value'] * 100).round(2).astype(str) + '%'
    plot_df['Value'] = plot_df['Value'] * 100
    fig = px.line(plot_df, x='SV', y='Value', color='Contribution',
                  hover_data={'SV': False,
                              'Contribution': False,
                              'Value': False,
                              'Contribution %': True},
                  markers=True,
                  color_discrete_map={'Individual': '#8F0A1B',
                                      'Cumulative': '#104C6B'})
    fig.update_layout(
        xaxis_title="<b>Number of SV</b>",
        yaxis_title="<b>Variance Explained</b>",
        title={'text': f'<b>Chosen SVs to '
                       f'Reach 90% Cumulative Variance Explained</b>',
               'xanchor': 'center',
               'yanchor': 'top',
               'x': 0.5},
        margin={'l': 0, 'r': 0, 't': 100, 'b': 0},
        font={'size': 14, 'color': '#333136'},
        plot_bgcolor='white',
        yaxis_range=[0, 100],
        hovermode="x",
        height=500,
        width=800)
    # Vertical dashed line marking the chosen number of components.
    fig.add_shape(
        type='line',
        yref='paper', y0=0, y1=1,
        xref='x', x0=num_components, x1=num_components,
        line={'color': '#333136', 'dash': 'dash', 'width': 2})
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#EEF4ED')
    fig.update_layout(legend={'yanchor': "top",
                              'y': 0.99,
                              'xanchor': "left",
                              'x': 0.01,
                              'title': '<b>Contribution</b>'})
    return fig
# Base plotnine theme for the loading plots: white panels, the project's
# brown/orange typography, dashed gold horizontal gridlines.
proj_theme = theme(
    strip_background=element_rect(fill="white"),
    legend_title=element_blank(),
    legend_text=element_text(size=9),
    legend_position=(0.5, 0.93),
    legend_direction="horizontal",
    legend_box_background=element_rect(fill="transparent", color="transparent"),
    axis_text_x=element_text(size=11, color="#722502"),
    axis_text_y=element_text(size=11, color="#722502", face="bold"),
    text=element_text(family="sans", color="#722502", size=18),
    axis_title_x=element_text(color='#DA4D2E', size=12, face="bold", margin={'t': 15}),
    axis_title_y=element_text(color='#DA4D2E', size=12, face="bold", margin={'r': 15}),
    plot_title=element_text(hjust=0.5, size=14, face="bold", margin={'b': 33}),
    panel_spacing=0.5,
    panel_background=element_rect(fill="white", color="#722502"),
    plot_background=element_rect(fill="white"),
    panel_grid_major_x=element_line(colour="white"),
    panel_grid_major_y=element_line(colour="#cfa544", linetype="dashed"),
    panel_grid_minor=element_blank(),
    strip_text_x=element_text(size=14, hjust=0.5, color="#722502", face="bold"),
)
# Variant of proj_theme with a warm beige (#F7E2B1) canvas, plain-weight
# axis text, and an explicit brown axis line.
proj_theme2 = theme(
    strip_background=element_rect(fill="white"),
    legend_title=element_blank(),
    legend_text=element_text(size=9),
    legend_position=(0.5, 0.93),
    legend_direction="horizontal",
    legend_box_background=element_rect(fill="transparent", color="transparent"),
    axis_text_x=element_text(size=10, color="#722502"),
    axis_text_y=element_text(size=10, color="#722502"),
    text=element_text(family="sans", color="#722502", size=18),
    axis_title_x=element_text(color='#DA4D2E', size=12, face="bold", margin={'t': 15}),
    axis_title_y=element_text(color='#DA4D2E', size=12, face="bold", margin={'r': 15}),
    plot_title=element_text(hjust=0.5, size=14, face="bold", margin={'b': 33}),
    panel_spacing=0.5,
    panel_background=element_rect(fill="#F7E2B1", color="#722502"),
    plot_background=element_rect(fill="#F7E2B1"),
    panel_grid_major_x=element_line(colour="#F7E2B1"),
    panel_grid_major_y=element_line(colour="#F7E2B1", linetype="dashed"),
    panel_grid_minor=element_blank(),
    axis_line=element_line(colour="#722502"),
    strip_text_x=element_text(size=14, hjust=0.5, color="#722502", face="bold"),
)
# Large-text variant of proj_theme2 (axis text 16pt, base text 22pt) for
# figures exported at presentation scale.
proj_theme3 = theme(
    strip_background=element_rect(fill="white"),
    legend_title=element_blank(),
    legend_text=element_text(size=9),
    legend_position=(0.5, 0.93),
    legend_direction="horizontal",
    legend_box_background=element_rect(fill="transparent", color="transparent"),
    axis_text_x=element_text(size=16, color="#722502"),
    axis_text_y=element_text(size=16, color="#722502"),
    text=element_text(family="sans", color="#722502", size=22),
    axis_title_x=element_text(color='#DA4D2E', size=12, face="bold", margin={'t': 15}),
    axis_title_y=element_text(color='#DA4D2E', size=12, face="bold", margin={'r': 15}),
    plot_title=element_text(hjust=0.5, size=14, face="bold", margin={'b': 33}),
    panel_spacing=0.5,
    panel_background=element_rect(fill="#F7E2B1", color="#722502"),
    plot_background=element_rect(fill="#F7E2B1"),
    panel_grid_major_x=element_line(colour="#F7E2B1"),
    panel_grid_major_y=element_line(colour="#F7E2B1", linetype="dashed"),
    panel_grid_minor=element_blank(),
    axis_line=element_line(colour="#722502"),
    strip_text_x=element_text(size=14, hjust=0.5, color="#722502", face="bold"),
)
def plot_svd_ulam(data, num_comp, num_ing, fill_fn, manual_fill_values):
    """Horizontal bar chart of the top-loading ingredients for one SV.

    Parameters
    ----------
    data : pandas.DataFrame
        Loadings indexed by ingredient, one column per component.
    num_comp : int
        1-based component number to plot.
    num_ing : int
        How many top-|loading| ingredients to display.
    fill_fn : callable
        Vectorized mapping from a loading value to a fill-color key.
    manual_fill_values : list
        Colors handed to scale_fill_manual.

    Returns
    -------
    plotnine.ggplot
    """
    loadings = data.iloc[:, num_comp - 1].reset_index()
    loadings.columns = ['ing', 'loading']
    loadings['abs_loading'] = loadings['loading'].abs()
    top = loadings.sort_values('abs_loading', ascending=False).head(num_ing)
    top['ing'] = top['ing'].str.replace('_', ' ').str.title()
    # Freeze the bar order to the magnitude-sorted ingredient order.
    top['ing'] = pd.Categorical(top['ing'], categories=top['ing'], ordered=True)
    top['ing_color'] = fill_fn(top['loading'])
    return (ggplot(top, aes(x='ing', y='loading'))
            + geom_bar(aes(fill='ing_color'), stat='identity', show_legend=False)
            + scale_fill_manual(values=manual_fill_values)
            + coord_flip()
            + xlab('') + ylab('')
            + proj_theme)
def final_clustering(transformed_arr,
                     orig_df,
                     method,
                     threshold,
                     plot_threshold,
                     break_biggest_cluster=False,
                     link_colors=None):
    """Build a hierarchical-clustering linkage matrix for the dishes.

    Parameters
    ----------
    transformed_arr : array-like
        Observations (e.g. the SVD-projected features) fed to linkage.
    orig_df : pandas.DataFrame
        Original feature frame; only consulted when
        `break_biggest_cluster` is True.
    method : str
        Linkage method for the first pass ('ward', 'average', ...).
    threshold : float
        Distance threshold used by fcluster to form flat clusters.
    plot_threshold : int
        Dendrogram truncation level; currently unused at runtime but
        retained for interface compatibility with existing callers
        (it fed the optional dendrogram plot).
    break_biggest_cluster : bool, optional
        If True, re-cluster only the members of the largest flat
        cluster (ward linkage) and return that sub-linkage instead.
    link_colors : list of str, optional
        Palette handed to scipy's dendrogram link coloring. Defaults
        to the project palette. (Was a mutable list default — fixed.)

    Returns
    -------
    numpy.ndarray
        A scipy linkage matrix (shape (n - 1, 4)).
    """
    if link_colors is None:
        # Avoid the mutable-default-argument pitfall; this is the
        # original project palette.
        link_colors = ['#5594BA', '#EFC564', '#DA4D2E',
                       '#722502', '#EFC564', '#B3C55A']
    Z = hierarchy.linkage(transformed_arr, method=method, optimal_ordering=True)
    # Side effect: sets the global palette used by later dendrogram calls.
    hierarchy.set_link_color_palette(link_colors)
    y_pred = fcluster(Z, t=threshold, criterion='distance')
    if break_biggest_cluster:
        X_cluster = orig_df.copy()
        X_cluster['cluster'] = y_pred
        # Re-run ward linkage on the members of the most populous cluster.
        biggest = X_cluster.groupby(['cluster']).size().idxmax()
        members = X_cluster[X_cluster.cluster == biggest].drop(columns='cluster')
        Z = linkage(members, method='ward', optimal_ordering=True)
    return Z
# Load the scraped recipe table, drop the name column and zero-variance
# features, min-max scale, and project onto the first 60 singular vectors.
with create_engine('sqlite:///ulam.db').connect() as conn:
    df_ing = pd.read_sql("""
    SELECT * FROM rekado
    """, conn)
fixed_df = fix_dtypes(df_ing)
dropped_df = drop_features(df_ing, irrelevant_cols=['dish_name'], skip_cols=None, p=0)
X = np.array(dropped_df, dtype=float)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
q, s, p, nssd = truncated_svd(X)
X_new = project_svd(q, s, 60)
# Right-singular-vector loadings per ingredient, first 60 components.
feature_names = dropped_df.columns
weights_df = (pd.DataFrame(p, index=feature_names)
              .rename(columns=lambda c: f'SV{c + 1}')
              .iloc[:, :60])
def custom_color(x):
    """Map a loading's sign to a fill color for the SV2 bar chart.

    Negative loadings get dark brown, non-negative get orange.
    BUG FIX: the original returned None for exactly-zero loadings,
    which would break np.vectorize's output-type inference.
    """
    if x < 0:
        return '#722502'
    return '#DA4D2E'
# SV2 loadings: the adobo-style vs afritada-style ingredient axis.
fig_sv2 = plot_svd_ulam(data=weights_df,
                        num_comp=2,
                        num_ing=8,
                        fill_fn=np.vectorize(custom_color),
                        manual_fill_values=['#722502', '#DA4D2E'])
fig_sv2.save(filename="adobo vs afritada.png", width=7.5, height=5, units='in')
def custom_color(x):
    """Map a loading's sign to a fill color for the SV5 bar chart.

    Negative loadings get dark olive, non-negative get light olive.
    BUG FIX: the original returned None for exactly-zero loadings,
    which would break np.vectorize's output-type inference.
    """
    if x < 0:
        return '#5E6A25'
    return '#8D9F38'
func = np.vectorize(custom_color)
# SV5 loadings: the kare-kare-style vs sinigang-style ingredient axis.
p = plot_svd_ulam(data=weights_df,
                  num_comp=5,
                  num_ing=8,
                  fill_fn=func,
                  manual_fill_values=['#F4903E', '#8D9F38'])
p.save(filename="kare-kare vs sinigang.png", width=7.5, height=5, units='in')
# Cluster the SVD-projected dishes hierarchically, then split the
# biggest flat cluster once more and renumber the labels densely.
# Dishes as rows (indexed by dish name) in the scaled feature space.
orig_df = pd.DataFrame(X, index=df_ing.dish_name, columns=dropped_df.columns)
# First pass: ward linkage on the projection, cut at distance 4.5.
Z = final_clustering(X_new, orig_df, 'ward', 4.5, 5, False,
                     link_colors=['#1F77B4',
                                  '#EFC564',
                                  '#DA4D2E',
                                  '#722502',
                                  '#F4903E',
                                  '#B3C55A'])
y_pred = fcluster(Z, t=4.5, criterion='distance')
X_cluster = orig_df.assign(cluster=y_pred)
# Second pass: re-cluster only the members of the largest cluster.
Z_1 = final_clustering(X_new, orig_df, 'ward', 4.5, 6, break_biggest_cluster=True,
                       link_colors=['#FA6961',
                                    '#FA6961',
                                    '#EFC564',
                                    '#CC9500',
                                    '#CC9500',
                                    '#1F77B4',
                                    '#1F77B4'])
y_pred_1 = fcluster(Z_1, t=3.7, criterion='distance')
get_big_cluster = X_cluster.groupby(['cluster']).size().idxmax()
X_cluster_1 = X_cluster[X_cluster.cluster == get_big_cluster].drop(columns='cluster')
# Offset sub-cluster ids past the first-pass ids so they cannot collide.
X_cluster_1['cluster'] = y_pred_1 + 6
# Overwrite the big cluster's rows with their new sub-cluster labels.
# NOTE(review): this .loc assignment aligns on the dish-name index —
# assumes those index values are unique; confirm upstream.
X_cluster.loc[X_cluster_1.index] = X_cluster_1
# Renumber the surviving cluster ids densely to 0..8, as strings
# (string labels are what the plotly color/name mappers expect).
renumber_cluster = dict(zip(sorted(X_cluster['cluster'].unique()), range(9)))
X_cluster['cluster_no'] = X_cluster['cluster'].map(renumber_cluster).astype(str)
# Projected coordinates, labelled SV1..SVk, plus the cluster label.
X_new_df = pd.DataFrame(X_new)
X_new_df.columns = [f'SV{i+1}' for i in X_new_df.columns]
X_new_df['cluster_no'] = list(X_cluster['cluster_no'])
Alam mo ba kung ano ang pinagkaiba ng Caldereta, Menudo, Mechado, at Afritada 🥘? Hindi? Kami rin!
Inspired by the ambiguity surrounding the Menudo-Afritada-Caldereta-Mechado debate, our team performed dimensionality reduction 🔨 and clustering methods ⚙️ on Panlasang Pinoy's recipes to cluster our very own Filipino dishes from an ingredients-list standpoint.
There are hundreds of dishes in the Philippines ☀️⭐, each prepared uniquely depending on the place of origin. The rich and unique flavor that our food has is shaped by the own richness of our history. In the food business, the term fusion is reserved for a curious and intriguing combination of dishes and cooking techniques from different cuisines that somehow work together. Our group, however, would call the Filipino Cuisine a fusion in itself because of how it has combined different foreign cooking styles and recipes with our local techniques and native ingredients.
Using the recipes from the Panlasang Pinoy website 🌐, our goal in this study is to cluster similar dishes, to pick out the ingredients that define them, and to identify spaces for both innovation and reinvention in our traditional recipes. Join us in our data-driven approach in exploring the diversity, creativeness, and boldness of the Filipino Cuisine!
Are you excited to know what's in your ulam, pare?
Pinoy dishes are known for their variety -- from the ingredients used down to the techniques. However, no matter how varied our dishes are, we can still uncover underlying patterns in our dishes that, in a way, can be deemed as conventional. "Nakasanayan na," kumabaga.
These underlying patterns can be mined using dimensionality reduction techniques 🔨. Instead of looking at the ingredients of our dishes one by one, we can look at them based on their tendencies to group or separate. For example, the following plots below show how ingredients are used together in Pinoy dishes. There are vegetables 🥕🥬 and condiments 🧂 that usually go together -- like soy sauce and bay leaf -- while there are those whose combinations are still unheard of.
Can you identify dishes that have the following combinations of ingredients?
Soy Sauce, Garlic, and Bay Leaf go together, while Potato, Carrot, and Bell Pepper go together.
Thinking of Adobo when you read soy sauce and bay leaf? Actually, soy sauce and bay leaf also appear in dishes like Lechon Paksiw and Humba. And we bet that your first thoughts about potato, carrot, and bell pepper differ from another reader's! Yes, ingredients on the right side of the plot are those that are found in varying proportions across the widely-debated Caldereta, Menudo, Mechado, and Afritada.
Garlic, Peanut, and Bagoong go together, while Okra and Kangkong go together.
Now, this is an interesting one! Can you even think of other Pinoy dishes, aside from Kare-Kare, that use garlic, peanut, and bagoong together? How about Pinoy dishes aside from Sinigang that have okra, sinigang mix, and kangkong together? This shows that some Pinoy dishes are so special that the combination of their ingredients could be unique only to them.
Now, using the above information, we clustered ⚙️ the dishes together based on the tendencies of their ingredients:
You can answer these and more using the interactive plot below!
Filipino dishes were clustered into 7 main groups based on their taste and flavor profile, consistency, and vegetable content.
Each point in the scatter plot represents a Filipino dish. The different colors represent the different groupings or clusters of the dishes based on similarity of ingredients. The horizontal position indicates the kinds of vegetables in the dish (from leafy vegetables to carrots and potatoes), while the vertical position indicates the dominating flavor of the dish (ranging across nutty, sweet, salty, and sour).
User Guide:
def custom_color(x):
    """Translate a cluster-number string into its plotly hex color.

    Unknown cluster numbers fall back to a near-white translucent fill.
    """
    palette = {
        '0': '#DA4D2E',  # carrots + potatoes, saucy
        '3': '#F4903E',  # leafy greens, saucy
        '4': '#8D9F38',  # leafy greens, soupy
        '6': '#EFC564',  # carrots + potatoes, soupy
        '2': '#722502',  # vinegar + soy sauce, salty
        '7': '#cc9500',  # vinegar + soy sauce, sweet
        '5': '#FA6961',  # tomato-based, sweet
    }
    return palette.get(x, 'rgba(249, 248, 252, 0.8)')
# Attach each dish's plotly color, derived from its cluster number.
X_new_df['plotly_colors'] = np.vectorize(custom_color)(X_new_df['cluster_no'])
def custom_names(x):
    """Translate a cluster-number string into a human-readable name.

    Unknown cluster numbers fall back to 'Others'.
    """
    names = {
        '0': 'Saucy Crops',   # carrots + potatoes, saucy
        '3': 'Nutty Sauce',   # leafy greens, saucy
        '4': 'Sour Soup',     # leafy greens, soupy
        '6': 'Salty Soup',    # carrots + potatoes, soupy
        '2': 'Soy Vinegar',   # vinegar + soy sauce, salty
        '7': 'Soy Sweet',     # vinegar + soy sauce, sweet
        '5': 'Red Sweet',     # tomato-based, sweet
    }
    return names.get(x, 'Others')
func = np.vectorize(custom_names)
X_new_df['cluster_names'] = func(X_new_df['cluster_no'])
# Fix the legend order from "Saucy Crops" through "Others".
X_new_df['cluster_names'] = pd.Categorical(
    X_new_df['cluster_names'],
    ['Saucy Crops', 'Nutty Sauce', 'Sour Soup', 'Salty Soup',
     'Soy Vinegar', 'Soy Sweet', 'Red Sweet', 'Others'],
    ordered=True)
X_new_df['Dish Name'] = X_cluster.index
# Build the cluster-name -> color mapping plotly will use.
dict_df = X_new_df[['cluster_names', 'plotly_colors']].drop_duplicates()
cmap = dict(zip(dict_df['cluster_names'], dict_df['plotly_colors']))
# Scatter-plot configuration: SV2 (x) vs SV5 (y), colored by cluster,
# with axis cross-hairs through the origin.
data = X_new_df
x = 'SV2'
y = 'SV5'
c = 'cluster_names'
tooltip = ['Dish Name']
y_ref = 0
x_ref = 0
# Interactive cluster map: one point per dish; sorted so legend entries
# follow the categorical cluster order.
fig = px.scatter(data.sort_values('cluster_names'),
                 x=x, y=y, color=c,
                 hover_data=tooltip,
                 labels={x: '<b>Vegetable Range</b>',
                         y: '<b>Flavor Range</b>',
                         c: '<b>Cluster</b>',
                         'Dish Name': '<b>Dish Name</b>'},
                 color_discrete_map=cmap,
                 width=1000, height=800)
fig.update_traces(marker={'size': 14})
fig.for_each_trace(lambda t: t.update(textfont_color='#99D072',
                                      textposition='top right'))
# Horizontal reference line across the full plot width at y = y_ref.
fig.add_shape(yref='y', y0=y_ref, y1=y_ref,
              xref='paper', x0=0, x1=1,
              line={'color': 'rgba(114, 37, 2, 0.5)', 'width': 2.5})
# Vertical reference line across the full plot height at x = x_ref.
fig.add_shape(type='line',
              yref='paper', y0=0, y1=1,
              xref='x', x0=x_ref, x1=x_ref,
              line={'color': 'rgba(114, 37, 2, 0.5)', 'width': 2.5})
fig.update_layout(margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
                  font={'size': 20, 'color': '#722502'},
                  plot_bgcolor='white',
                  xaxis_range=[-1.3, 1.3],
                  yaxis_range=[-1.3, 1.3],
                  paper_bgcolor='white')
fig.update_xaxes(showline=True, linewidth=2, linecolor='#722502', gridcolor='#F7E2B1')
fig.update_yaxes(showline=True, linewidth=2, linecolor='#722502', gridcolor='#F7E2B1')
fig.show()
Which dishes usually have soy sauce and garlic?
Soy sauce and garlic can be in forms of ulam (adobo, humba, etc.), but also in forms of sawsawan. This is why deep-fried goodness Crispy Pata and the like are lumped together with Adobo.
Are there other dishes that are likely to join the feud between Afritada, Kaldereta, and Mechado?
Our findings show that under MASARSA + POTATO AND CARROTS, apart from these tomato-based dishes, we also have gata-based dishes such as Curry and Igado.
Are we sure that no other dishes, aside from Kare-Kare, use peanut and bagoong? Is Sinigang so unique that the algorithm will not be able to find other similar dishes?
Based on our scraped data from Panlasang Pinoy, there are no other dishes aside from Kare-Kare that use peanut and bagoong together. In fact, in terms of innovation, Kare-Kare is doing poorer than its close counterpart, Sinigang, which has started releasing variants such as Sinigang sa Watermelon.
Were you able to derive other findings in your exploration? Tell us about them once you meet us and watch our presentation this coming Thursday, September 9! Zoom link will be sent to you after filling up this registration form: https://go.aim.edu/dmw2021 Kita-kits!